# coding:utf8
import re

from scpy.xawesome_time import parse_time


def time_clean(time):
    """Clean a raw time value with ``parse_time``.

    :param time: raw value handed to ``parse_time`` unchanged.
    :return: the ``parse_time`` result when it is truthy, otherwise ``''``.
    """
    # Parse once and reuse the result instead of calling parse_time twice.
    return parse_time(time) or ''


def money_clean(money):
    """Extract the number embedded in ``money`` and scale it by 10000.

    Every run of digits in ``money`` is collected and the runs are joined
    with ``.`` (so ``'1.5'`` is read as 1.5) before conversion to float.

    NOTE(review): this function used to branch on whether the text contains
    the ten-thousand marker, yet both branches applied the same x10000
    scaling. The scaling is kept unconditional here so results do not
    change; confirm whether amounts without the marker should really be
    scaled.

    :param money: text containing the amount.
    :return: the parsed amount multiplied by 10000, as float.
    :raises ValueError: when ``money`` holds no digits, or more than two
        digit runs (the joined text is then not a valid float).
    """
    digit_runs = re.findall(r'\d+', money)
    return float('.'.join(digit_runs)) * 10000


def money_notclean(money):
    """Parse the amount embedded in ``money`` without any unit scaling.

    Commas are dropped first, then every run of digits (optionally preceded
    by minus signs) is collected and the runs are joined with ``.`` before
    conversion to float.

    :param money: text containing the amount.
    :return: the parsed float, or ``0`` when no digits are present.
    """
    pieces = re.findall(r'-*?\d+', money.replace(",", ""))
    if not pieces:
        return 0
    return float('.'.join(pieces))


def td(html, word):
    """Return the text of the cell that follows the label cell ``word``.

    Every tag whose name starts with ``t`` (``<table ...>``, ``<tr>``,
    ``<th>``, ``<td ...>``) is collapsed to a bare ``<td>`` and all tabs,
    CR/LF and spaces are removed, so a label/value pair can be matched as
    ``<td>word</td><td>value</td>`` (the label cell may also close with
    ``</th>``).

    :param html: HTML fragment to search.
    :param word: label text; matched literally (it is regex-escaped, so
        labels containing characters such as ``(`` and ``)`` still match).
    :return: the first matching value with nested tags and ``&nbsp;``
        removed, or ``''`` when the label is not found.
    """
    html = re.sub(r'<t[\s\S]+?>', '<td>', html)
    html1 = html.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '')
    content = re.findall('<td>%s</t[dh]><td>(.*?)</td>' % re.escape(word), html1)
    if not content:
        return ''
    # Drop any markup nested inside the value cell.
    return re.sub('<.+?>', '', content[0]).replace('&nbsp;', '')


def tr(table):
    """Normalize row tags and strip layout whitespace from ``table``.

    Each opening ``<tr ...>`` tag is reduced to a bare ``<tr>``; afterwards
    all spaces, carriage returns, tabs and newlines are deleted.

    :param table: HTML fragment.
    :return: the normalized fragment.
    """
    normalized = re.sub('<tr.*?>', '<tr>', table)
    for blank in (' ', '\r', '\t', '\n'):
        normalized = normalized.replace(blank, '')
    return normalized


def td_clean(html):
    """Strip attributes from ``<td ...>`` tags and remove layout whitespace.

    Bare ``<td>`` tags are parked behind a placeholder first: the attribute
    pattern needs at least one character after ``<td`` and would otherwise
    swallow the cell content that follows a bare tag.

    :param html: HTML fragment.
    :return: the fragment with bare ``<td>`` tags and no spaces, CR, tabs
        or newlines.
    """
    placeholder = '@@@@@'
    shielded = html.replace('<td>', placeholder)
    stripped = re.sub('<td.+?>', '<td>', shielded)
    restored = stripped.replace(placeholder, '<td>')
    for blank in (' ', '\r', '\t', '\n'):
        restored = restored.replace(blank, '')
    return restored


def th(table, word):
    """Return the non-empty cell text that follows the ``</th>`` label ``word``.

    Every tag whose name starts with ``t`` is collapsed to a bare ``<td>``
    and tabs, CR/LF, spaces and ``&nbsp;`` are removed before matching
    ``<td>word</th><td>value</td>``.

    :param table: HTML fragment to search.
    :param word: label text; matched literally (it is regex-escaped, so
        labels containing characters such as ``(`` and ``)`` still match).
    :return: the first matching value with nested tags removed, or ``''``
        when the label is not found.
    """
    html = re.sub(r'<t[\s\S]+?>', '<td>', table)
    html1 = html.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '').replace('&nbsp;', '')
    content = re.findall('<td>%s</th><td>(.+?)</td>' % re.escape(word), html1)
    if not content:
        return ''
    # Drop any markup nested inside the value cell.
    return re.sub('<.+?>', '', content[0]).replace('&nbsp;', '')


def table_clean(html, word):
    """Return the first ``<table>...</table>`` fragment that mentions ``word``.

    Both arguments are coerced with ``str()``. ``word`` is treated as
    literal text: it is checked as a plain substring first and regex-escaped
    before being embedded in the table pattern, so characters such as
    ``(`` or ``.`` in it cannot change the match.

    :param html: page HTML.
    :param word: text that must appear inside the wanted table, after its
        opening tag.
    :return: the matching table fragment, or ``''`` when none is found.
    """
    word = str(word)
    html = str(html)
    if word not in html:
        return ''
    pattern = "<table.*?>.*?%s.*?</table>" % re.escape(word)
    for table in re.findall("<table.*?>.*?</table>", html, re.S):
        res = re.findall(pattern, table, re.S)
        if res:
            return res[0]
    return ''


def _first_td(table, labels):
    """Return the first non-empty ``td`` lookup among ``labels`` ('' if none)."""
    value = ''
    for label in labels:
        value = td(table, label)
        if value:
            break
    return value


def _first_time(table, labels):
    """Return the first non-empty ``time_clean(td(...))`` among ``labels``."""
    value = ''
    for label in labels:
        value = time_clean(td(table, label))
        if value:
            break
    return value


def basic(table):
    """Parse the basic-information table of an enterprise.

    ``<br>`` variants are removed, then each field is looked up by its label
    with ``td``; date fields go through ``time_clean``. Fields that exist
    under several alternative labels use the first label that yields a
    value.

    :param table: HTML of the basic-information table.
    :return: a one-element list holding a dict of the parsed fields.
        ``regCap`` is the ``money_notclean`` result for the capital cell, or
        ``''`` when the table has no capital label. ``regCapCur`` is filled
        in from the capital text only when a capital amount was parsed and
        no explicit currency cell exists. ``SocialCreditIdentifier`` is
        always ``''``.
    """
    for br in ('<br>', '<br/>', '<br />'):
        table = table.replace(br, '')

    # Registration number: prefer the combined labels, fall back to the plain one.
    for label in ('统一社会信用代码/注册号', '注册号/统一社会信用代码'):
        if label in table:
            regNo = td(table, label)
            break
    else:
        regNo = td(table, '注册号')

    enterpriseName = td(table, '名称')
    frName = _first_td(table, ('法定代表人', '负责人', '投资人', '执行事务合伙人', '经营者'))

    # Capital: read the cell of whichever capital label the table carries,
    # so the second label is looked up under its own name.
    capText = ''
    regCap = ''
    for label in ('注册资本', '成员出资总额'):
        if label in table:
            capText = td(table, label)
            regCap = money_notclean(capText)
            break

    regCapCur = td(table, '注册币种')
    if regCap and not regCapCur:
        # No explicit currency cell: infer it from the capital text.
        regCapCur = '美元' if '美' in capText else '人民币'

    esDate = time_clean(td(table, '成立日期'))
    openFrom = _first_time(table, ('经营期限自', '营业期限自', '合伙期限自'))
    openTo = _first_time(table, ('经营期限至', '营业期限至', '合伙期限至'))
    auditDate = time_clean(td(table, '核准日期'))
    enterpriseType = td(table, '类型')
    enterpriseStatus = td(table, '登记状态')
    cancelDate = time_clean(td(table, '注销日期'))
    revokeDate = time_clean(td(table, '吊销日期'))
    address = _first_td(table, ('住所', '营业场所', '经营场所', '主要经营场所'))
    abuItem = td(table, '许可经营项目')
    cbuItem = td(table, '一般经营项目')
    operateScope = td(table, '经营范围')
    operateScopeAndForm = td(table, '经营(业务)范围及方式')
    regOrg = td(table, '登记机关')
    ancheYear = time_clean(td(table, '最后年检年度'))
    ancheDate = time_clean(td(table, '最后年检日期'))

    industryPhyCode = td(table, '行业门类代码')
    industryPhyName = td(table, '行业门类名称')
    industryCode = td(table, '国民经济行业代码')
    industryName = td(table, '国民经济行业名称')
    recCap = td(table, '实收资本')
    oriRegNo = td(table, '原注册号')

    result = [{"regNo": regNo, 'SocialCreditIdentifier': '', "enterpriseName": enterpriseName, "frName": frName,
               "regCap": regCap, "regCapCur": regCapCur, "esDate": esDate, "openFrom": openFrom, "openTo": openTo,
               "enterpriseType": enterpriseType, "auditDate": auditDate, "enterpriseStatus": enterpriseStatus,
               "cancelDate": cancelDate, "revokeDate": revokeDate, "address": address, "abuItem": abuItem,
               "cbuItem": cbuItem, "operateScope": operateScope, "operateScopeAndForm": operateScopeAndForm,
               "regOrg": regOrg, "ancheYear": ancheYear, "ancheDate": ancheDate, "industryPhyCode": industryPhyCode,
               "industryPhyName": industryPhyName, "industryCode": industryCode, "industryName": industryName,
               "recCap": recCap, "oriRegNo": oriRegNo}]
    return result


def shareHolderList(table):
    """Parse the shareholder table into a list of dicts.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). Each remaining row
    needs at least two cells; rows with fewer are ignored.

    :param table: HTML of the shareholder table.
    :return: list of ``{"shareholderType", "shareholderName"}`` dicts; a
        ``"shareHolderdetail"`` key (fifth cell) is added when the row has
        more than four cells. Empty list when there are no data rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    holders = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 2:
            continue
        entry = {"shareholderType": cells[0], "shareholderName": cells[1]}
        if len(cells) > 4:
            entry["shareHolderdetail"] = cells[4]
        holders.append(entry)
    return holders


def investment_information(table):  # shareholder and capital-contribution information
    """Extract two trailing cells from the first complete contribution row.

    The table is normalized with ``tr``, split into ``<tr>`` rows and the
    first two rows are deleted. Each remaining row is cleaned with
    ``td_clean`` and split into cells; cell 4 is passed through
    ``time_clean``. Cell 5 and then cell 6 are appended to a list, and the
    list is returned as soon as one row provides both.

    Return value, as the code stands:
    - the list described above, for the first row with at least 7 cells;
    - ``''`` when no rows remain after dropping the first two;
    - ``None`` (implicit) when rows exist but none has 7 cells.

    Raises ``IndexError`` when the table has fewer than two ``<tr>`` rows.

    NOTE(review): ``name``, ``investment_type``, ``contribute`` and
    ``contribute_date`` are read but never used, and a row with exactly 6
    cells leaves its cell 5 in the list before being skipped. This looks
    unfinished - confirm the intended output before relying on it.
    """
    table = tr(table)
    detail = re.findall('<tr>.*?</tr>', table)
    # Drop the first two rows; raises IndexError if they do not exist.
    del detail[0]
    del detail[0]
    if detail:
        # print detail
        alterList = []
        for i in detail:
            i = td_clean(i)
            detail_td = re.findall("<td>(.*?)</td>", i)
            try:
                name = detail_td[0]
                investment_type = detail_td[1]
                contribute = detail_td[2]
                investment_mode = detail_td[3]
                contribute_date = time_clean(detail_td[4])

                paid_in = detail_td[5]
                # Appended before cell 6 is read, so it stays in the list
                # even if the next lookup fails.
                alterList.append(paid_in)
                investment_mode = detail_td[6]
                alterList.append(investment_mode)
                # Returns inside the loop: later rows are never visited.
                return alterList
            except:
                # Any failure (missing cell, time_clean error) skips the row.
                continue
                # alterList.append(dic1)
                # return alterList
    else:
        alterList = []
        return ''
        # return alterList


def alterList(table):
    """Parse the change-record table into a list of dicts.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). A row needs at least
    four cells; the fourth is cleaned with ``time_clean``.

    :param table: HTML of the change-record table.
    :return: list of ``{"altItem", "altBe", "altAf", "altDate"}`` dicts;
        empty list when there are no usable rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    changes = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 4:
            continue
        try:
            altDate = time_clean(cells[3])
        except Exception:
            # Best effort: a date that cannot be cleaned drops the row.
            continue
        changes.append({"altItem": cells[0], "altBe": cells[1], "altAf": cells[2], "altDate": altDate})
    return changes


def personList(table):
    """Parse the key-personnel table into a list of dicts.

    After ``td_clean`` the table is scanned for triples of cells made of a
    numeric cell followed by two more cells; the latter two are taken as
    name and position.

    :param table: HTML of the personnel table.
    :return: list of ``{"name", "position", "sex"}`` dicts (``sex`` is
        always ``""``); empty list when nothing matches.
    """
    matches = re.findall(r'<td>\d+?</td><td>(.*?)</td><td>(.*?)</td>', td_clean(table))
    return [{"name": name, "position": position, "sex": ""} for name, position in matches]


def filiationList(table):
    """Parse the branch-office table into a list of dicts.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). Rows with fewer than
    four cells are ignored.

    :param table: HTML of the branch-office table.
    :return: list of ``{"seq_no", "reg_no", "name", "belong_org"}`` dicts;
        empty list when there are no usable rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    branches = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 4:
            continue
        branches.append({"seq_no": cells[0], "reg_no": cells[1], "name": cells[2], "belong_org": cells[3]})
    return branches


def liquidationList(table):
    """Parse the liquidation table.

    Looks up the liquidation-group leader and member cells with ``td``.

    :param table: HTML of the liquidation table.
    :return: ``[{"charge": ..., "person": ...}]`` when at least one of the
        two values is non-empty, otherwise an empty list.
    """
    charge = td(table, '清算组负责人')
    person = td(table, '清算组成员')
    if charge or person:
        return [{"charge": charge, "person": person}]
    return []


def abnormalOperation(table):
    """Parse the abnormal-operation table into a list of dicts.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). A row needs at least
    six cells; the third and fifth are cleaned with ``time_clean``.

    :param table: HTML of the abnormal-operation table.
    :return: list of ``{"specauseno", "specause", "abntime", "recause",
        "retime", "decorg"}`` dicts; empty list when there are no usable
        rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    records = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 6:
            continue
        try:
            abntime = time_clean(cells[2])
            retime = time_clean(cells[4])
        except Exception:
            # Best effort: a date that cannot be cleaned drops the row.
            continue
        records.append({"specauseno": cells[0], "specause": cells[1], "abntime": abntime, "recause": cells[3],
                        "retime": retime, "decorg": cells[5]})
    return records


def checkMessage(table):
    """Parse the spot-check table into a list of dicts.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). A row needs at least
    five cells; the fourth is cleaned with ``time_clean``.

    :param table: HTML of the spot-check table.
    :return: list of ``{"seq_no", "institution", "check_type",
        "check_date", "check_result"}`` dicts; empty list when there are no
        usable rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    checks = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 5:
            continue
        try:
            check_date = time_clean(cells[3])
        except Exception:
            # Best effort: a date that cannot be cleaned drops the row.
            continue
        checks.append({"seq_no": cells[0], "institution": cells[1], "check_type": cells[2],
                       "check_date": check_date, "check_result": cells[4]})
    return checks


def report_basic(table):
    """Parse the basic-information table of an annual report.

    ``<br>`` variants are removed, then each field is looked up by its label
    with ``td``. Fields that exist under alternative labels fall back to
    the next label when the preferred one is absent or empty.

    :param table: HTML of the annual-report basic-information table.
    :return: dict with keys ``regNo``, ``phone``, ``email``, ``zipcode``,
        ``enterpriseStatus``, ``haveWebsite``, ``buyEquity``,
        ``equityTransfer``, ``address`` and ``employeeCount``; missing
        fields are ``''``.
    """
    for br in ('<br>', '<br/>', '<br />'):
        table = table.replace(br, '')

    # Registration number: use the first label variant present in the table.
    for label in ('统一社会信用代码/注册号', '注册号/统一社会信用代码', '营业执照注册号', '统一社会信用代码'):
        if label in table:
            regNo = td(table, label)
            break
    else:
        regNo = td(table, '注册号')

    if '企业联系电话' in table:
        phone = td(table, '企业联系电话')
    else:
        phone = td(table, '联系电话')

    # Test for the label in the table; a bare non-empty string literal is
    # always true and would make the fallback label unreachable.
    if '企业电子邮箱' in table:
        email = td(table, '企业电子邮箱')
    else:
        email = td(table, '电子邮箱')

    zipcode = td(table, '邮政编码')
    enterpriseStatus = td(table, '企业经营状态')
    haveWebsite = td(table, '是否有网站或网店') or td(table, '是否有网站或网点')
    buyEquity = (td(table, '企业是否有投资信息或购买其他公司股权')
                 or td(table, '企业是否有对外投资设立企业信息'))
    equityTransfer = td(table, '有限责任公司本年度是否发生股东股权转让')
    address = td(table, '企业通信地址')
    employeeCount = td(table, '从业人数') or td(table, '成员人数')

    baseInfo = {"regNo": regNo, "phone": phone, "email": email, "zipcode": zipcode,
                "enterpriseStatus": enterpriseStatus, "haveWebsite": haveWebsite, "buyEquity": buyEquity,
                "equityTransfer": equityTransfer, "address": address, "employeeCount": employeeCount}
    return baseInfo


def report_website(table):
    """Parse the annual-report website table into a single dict.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). Rows with fewer than
    three cells, or whose first three cells are all empty, are ignored.
    When several rows qualify, the last one wins.

    :param table: HTML of the website table.
    :return: ``{"type", "name", "link"}`` for the last qualifying row, or
        ``{}`` when there is none.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    website = {}
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 3:
            continue
        type1, name, link = cells[0], cells[1], cells[2]
        if type1 or name or link:
            website = {"type": type1, "name": name, "link": link}
    return website


def report_assetsInfo(table):
    """Collect the asset-status figures of an annual report.

    Every figure is read with ``td`` by its label and returned as the raw
    cell text; the main-revenue figure falls back to a second label when
    the first lookup is empty.

    :param table: HTML of the asset-status table.
    :return: dict with keys ``generalAssets``, ``ownersEequity``,
        ``revenue``, ``profit``, ``mainRevenue``, ``netProfit``,
        ``taxPayment`` and ``liability``.
    """
    main_revenue = td(table, '营业总收入中主营业务收入')
    if not main_revenue:
        main_revenue = td(table, '其中：主营业务收入')
    return {
        "generalAssets": td(table, '资产总额'),
        "ownersEequity": td(table, '所有者权益合计'),
        "revenue": td(table, '营业总收入'),
        "profit": td(table, '利润总额'),
        "mainRevenue": main_revenue,
        "netProfit": td(table, '净利润'),
        "taxPayment": td(table, '纳税总额'),
        "liability": td(table, '负债总额'),
    }


def report_investorInformations(table):
    """Parse the annual-report shareholder/contribution table.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). A row needs at least
    seven cells; amounts go through ``money_notclean`` and dates through
    ``time_clean``. Rows whose parsed fields are all empty are dropped.

    :param table: HTML of the contribution table.
    :return: list of ``{"shareholderName", "subConam", "subConDate",
        "subConType", "paidConMoney", "paidTime", "paidType"}`` dicts;
        empty list when there are no usable rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    investors = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 7:
            continue
        try:
            record = {
                "shareholderName": cells[0],
                "subConam": money_notclean(cells[1]),
                "subConDate": time_clean(cells[2]),
                "subConType": cells[3],
                "paidConMoney": money_notclean(cells[4]),
                "paidTime": time_clean(cells[5]),
                "paidType": cells[6],
            }
        except Exception:
            # Best effort: an amount or date that cannot be parsed drops the row.
            continue
        if any(record.values()):
            investors.append(record)
    return investors


def report_equityChangeInformations(table):
    """Parse the annual-report equity-change table.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). A row needs at least
    four cells; the fourth is cleaned with ``time_clean``. Rows whose four
    fields are all empty are dropped.

    :param table: HTML of the equity-change table.
    :return: list of ``{"shareholderName", "equityBefore", "equityAfter",
        "time"}`` dicts; empty list when there are no usable rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    changes = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 4:
            continue
        try:
            changed_at = time_clean(cells[3])
        except Exception:
            # Best effort: a date that cannot be cleaned drops the row.
            continue
        record = {"shareholderName": cells[0], "equityBefore": cells[1],
                  "equityAfter": cells[2], "time": changed_at}
        if any(record.values()):
            changes.append(record)
    return changes


def report_changeRecords(table):
    """Parse the annual-report modification-record table.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). A row needs at least
    five cells: the first cell is ignored, the next three are the changed
    item and its before/after values, and the fifth is cleaned with
    ``time_clean``. Rows whose item/before/after cells are all empty are
    dropped.

    :param table: HTML of the modification-record table.
    :return: list of ``{"changedItem", "beforeChange", "afterChange",
        "time"}`` dicts; empty list when there are no usable rows.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    changeRecords = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 5:
            continue
        changedItem, beforeChange, afterChange = cells[1], cells[2], cells[3]
        if not (changedItem or beforeChange or afterChange):
            continue
        try:
            changed_at = time_clean(cells[4])
        except Exception:
            # Best effort: a date that cannot be cleaned drops the row.
            continue
        changeRecords.append({"changedItem": changedItem, "beforeChange": beforeChange,
                              "afterChange": afterChange, "time": changed_at})
    return changeRecords


def report_entinvItem(table):
    """Parse the annual-report outbound-investment table.

    The first two ``<tr>`` rows are skipped (presumably the title and the
    column-header rows - confirm against real pages). A row needs at least
    two cells, taken as the invested enterprise's name and registration
    number; rows where both are empty are dropped.

    :param table: HTML of the outbound-investment table.
    :return: list of dicts in which only ``entName`` and ``regNo`` are
        filled; every other key is present with the value ``""``.
    """
    # Slice instead of deleting by index so a table with fewer than two
    # rows yields an empty result rather than an IndexError.
    rows = re.findall('<tr>.*?</tr>', tr(table))[2:]
    entinvItem = []
    for row in rows:
        cells = re.findall("<td>(.*?)</td>", td_clean(row))
        if len(cells) < 2:
            continue
        entName, reg_no = cells[0], cells[1]
        if entName or reg_no:
            entinvItem.append({"entName": entName, "entType": "", "fundedRatio": "", "currency": "",
                               "entStatus": "", "canDate": "", "esDate": "", "regOrg": "", "regCapcur": "",
                               "regCap": "", "revDate": "", "name": "", "subConam": "", "regNo": reg_no})
    return entinvItem


# def temp(table):
def index(word, table):
    """Dispatch a registry section to its parser.

    :param word: section title as it appears on the page.
    :param table: HTML of that section's table.
    :return: the result of the matching parser, or ``None`` when ``word``
        is not a known section title.
    """
    if word == '基本信息':
        parser = basic
    elif word in ('股东信息', '股东（发起人）信息'):
        parser = shareHolderList
    elif word == '变更信息':
        parser = alterList
    elif word == '主要人员信息':
        parser = personList
    elif word == '分支机构信息':
        parser = filiationList
    elif word == '清算信息':
        parser = liquidationList
    elif word == '经营异常信息':
        parser = abnormalOperation
    elif word == '抽查检查信息':
        parser = checkMessage
    else:
        return None
    return parser(table)


def report_index(word, table):
    """Dispatch an annual-report section to its parser.

    :param word: section title as it appears in the report.
    :param table: HTML of that section's table.
    :return: the result of the matching parser, or ``None`` when ``word``
        is not a known section title.
    """
    if word == '企业基本信息':
        parser = report_basic
    elif word == '网站或网店信息':
        parser = report_website
    elif word == '企业资产状况信息':
        parser = report_assetsInfo
    elif word == '股东及出资信息':
        parser = report_investorInformations
    elif word == '股权变更信息':
        parser = report_equityChangeInformations
    elif word == '修改记录':
        parser = report_changeRecords
    elif word == '对外投资信息':
        parser = report_entinvItem
    else:
        return None
    return parser(table)


if __name__ == '__main__':
    import json

    # Manual smoke test: parse a sample basic-information table and dump the
    # result as JSON. (A stray ``re.findall`` call that was missing its
    # string argument, and therefore raised TypeError, has been dropped.)
    html = '<colwidth="20%"/><colwidth="30%"/><colwidth="20%"/><colwidth="30%"/><td><td>基本信息</th></tr><td><td>统一社会信用代码/注册号</th><td>350782100056470</td><td>名称</th><td>武夷山市剑峰岩茶厂</td></tr><td><td>类型</th><td></td><td>投资人</th><td>江剑锋</td></tr><td><td>住所</th><td>福建省南平市武夷山市景区公馆村39号</td></tr><td><td>经营范围</th><td>一般经营项目：毛茶制售。(以上经营范围涉及许可经营项目的，应在取得有关部门的许可后方可经营)</td></tr><td><td>登记机关</th><td>福建省武夷山市市场监督管理局</td><td>核准日期</th><td>2014年10月24日</td></tr><td><td>成立日期</th><td>2013年11月22日</td><td>登记状态</th><td>存续（在营、开业、在册）</td></tr></table>'
    # Single-argument parenthesized print behaves the same on Python 2 and 3.
    print(json.dumps(basic(html), ensure_ascii=False, indent=4))
